{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lightning Tour\n",
    "\n",
    "This notebook is a runnable companion to the [Lightning Tour](https://spacy.io/docs/usage/lightning-tour) page of the spaCy documentation.\n",
    "\n",
    "The following examples and code snippets give you an overview of spaCy's functionality and its usage.\n",
    "\n",
    "### Load resources and process text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Make every string literal below a unicode string, so we get no unicode\n",
    "# based errors. Kept as the first statement because Python requires\n",
    "# __future__ imports to come first when this cell is run as a plain script.\n",
    "from __future__ import unicode_literals\n",
    "\n",
    "import spacy\n",
    "\n",
    "# Load the English and German pipelines and process one text with each.\n",
    "# en_nlp and en_doc are reused by later cells.\n",
    "en_nlp = spacy.load('en')\n",
    "de_nlp = spacy.load('de')\n",
    "en_doc = en_nlp(u'Hello, world. Here are two sentences.')\n",
    "de_doc = de_nlp(u'ich bin ein Berliner.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multi-threaded generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "texts = [u'One document.', u'...', u'Lots of documents']\n",
    "# .pipe streams input, and produces streaming output.\n",
    "# The generator cycles through the sample texts on demand; using len(texts)\n",
    "# keeps the modulus correct if the sample list is edited.\n",
    "iter_texts = (texts[i % len(texts)] for i in xrange(100000000))\n",
    "for i, doc in enumerate(en_nlp.pipe(iter_texts, batch_size=50, n_threads=4)):\n",
    "    assert doc.is_parsed\n",
    "    # Stop after 101 documents instead of exhausting the whole stream.\n",
    "    if i == 100:\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get tokens and sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "token = en_doc[0]\n",
    "# next() pulls just the first sentence out of the sentence iterator\n",
    "sentence = next(en_doc.sents)\n",
    "\n",
    "# Indexing the sentence gives back the very same token object as the doc.\n",
    "first_in_sentence = sentence[0]\n",
    "assert first_in_sentence is token\n",
    "assert 'Hello, world.' == sentence.text"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use integer IDs for any string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The string store maps in both directions: string -> ID and ID -> string.\n",
    "string_store = en_nlp.vocab.strings\n",
    "hello_id = string_store['Hello']\n",
    "hello_str = string_store[hello_id]\n",
    "\n",
    "# .orth is the integer view of the token text, .orth_ the string view.\n",
    "assert token.orth == hello_id\n",
    "assert hello_id == 6747\n",
    "assert token.orth_ == hello_str\n",
    "assert hello_str == 'Hello'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get and set string views and flags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "assert token.shape_ == u\"Xxxxx\"\n",
    "\n",
    "# Overwrite the shape of every lexeme in the vocab with a one-letter class.\n",
    "# The flags are tested in this order: alpha, digit, punct, anything else.\n",
    "for lexeme in en_nlp.vocab:\n",
    "    lexeme.shape_ = ('W' if lexeme.is_alpha else\n",
    "                     'D' if lexeme.is_digit else\n",
    "                     'P' if lexeme.is_punct else\n",
    "                     'M')\n",
    "\n",
    "# The token now reports the shape that was just set on the vocab entries.\n",
    "assert token.shape_ == 'W'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export to numpy arrays"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from spacy.attrs import ORTH, LIKE_URL, IS_OOV\n",
    "\n",
    "attr_ids = [ORTH, LIKE_URL, IS_OOV]\n",
    "doc_array = en_doc.to_array(attr_ids)\n",
    "\n",
    "# One row per token, one column per requested attribute.\n",
    "assert doc_array.shape == (len(en_doc), len(attr_ids))\n",
    "\n",
    "# Column 0 is ORTH: check it for the first two tokens.\n",
    "for row in (0, 1):\n",
    "    assert en_doc[row].orth == doc_array[row, 0]\n",
    "\n",
    "# Column 1 is LIKE_URL: check the first token, then the whole column.\n",
    "assert en_doc[0].like_url == doc_array[0, 1]\n",
    "assert list(doc_array[:, 1]) == [t.like_url for t in en_doc]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Word vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "doc = en_nlp(u\"Apples and oranges are similar. Boots and hippos aren't.\")\n",
    "\n",
    "# Pick the four words to compare by their token index in the doc.\n",
    "apples, oranges, boots, hippos = doc[0], doc[2], doc[6], doc[8]\n",
    "\n",
    "fruit_similarity = apples.similarity(oranges)\n",
    "other_similarity = boots.similarity(hippos)\n",
    "assert fruit_similarity > other_similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Part-of-speech tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from spacy.parts_of_speech import ADV\n",
    "\n",
    "def is_adverb(token):\n",
    "    '''Return True if the token's coarse-grained POS ID equals ADV.'''\n",
    "    return token.pos == ADV\n",
    "\n",
    "# These are data-specific, so no constants are provided. You have to look\n",
    "# up the IDs from the StringStore.\n",
    "NNS = en_nlp.vocab.strings['NNS']\n",
    "NNPS = en_nlp.vocab.strings['NNPS']\n",
    "def is_plural_noun(token):\n",
    "    '''Return True if the token's fine-grained tag ID is NNS or NNPS.'''\n",
    "    return token.tag in (NNS, NNPS)\n",
    "\n",
    "def print_coarse_pos(token):\n",
    "    '''Print the string form of the token's coarse-grained POS.'''\n",
    "    print(token.pos_)\n",
    "\n",
    "def print_fine_pos(token):\n",
    "    '''Print the string form of the token's fine-grained tag.'''\n",
    "    print(token.tag_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Syntactic dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def dependency_labels_to_root(token):\n",
    "    '''Collect the arc labels on the path from `token` up to the root.\n",
    "\n",
    "    Follows `.head` from `token` until reaching a token that is its own\n",
    "    head, recording the `.dep` of every token left behind on the way.\n",
    "    Returns the labels as a list ordered from `token` upwards; the list is\n",
    "    empty when `token` is already its own head.\n",
    "    '''\n",
    "    path = []\n",
    "    node = token\n",
    "    while True:\n",
    "        parent = node.head\n",
    "        if parent is node:\n",
    "            return path\n",
    "        path.append(node.dep)\n",
    "        node = parent"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Named entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "def iter_products(docs):\n",
    "    '''Yield every entity labelled PRODUCT from each doc in `docs`.'''\n",
    "    for doc in docs:\n",
    "        for ent in doc.ents:\n",
    "            if ent.label_ == 'PRODUCT':\n",
    "                yield ent\n",
    "\n",
    "def word_is_in_entity(word):\n",
    "    '''Return True if `word` carries a non-zero entity type ID.'''\n",
    "    return word.ent_type != 0\n",
    "\n",
    "def count_parent_verb_by_person(docs):\n",
    "    '''Count, per PERSON entity text, the lemmas of the verbs heading it.\n",
    "\n",
    "    Only entities labelled PERSON whose root token has a verb as its head\n",
    "    are counted.\n",
    "    Returns a nested defaultdict: counts[entity text][verb lemma] -> int.\n",
    "    '''\n",
    "    counts = defaultdict(lambda: defaultdict(int))\n",
    "    for doc in docs:\n",
    "        for ent in doc.ents:\n",
    "            # Compare the string view of the coarse POS, like the label test,\n",
    "            # so no symbol constant has to be imported.\n",
    "            if ent.label_ == 'PERSON' and ent.root.head.pos_ == 'VERB':\n",
    "                counts[ent.orth_][ent.root.head.lemma_] += 1\n",
    "    return counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculate inline mark-up on original string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def put_spans_around_tokens(doc, get_classes):\n",
    "    '''Given some function to compute class names, put each token in a\n",
    "    span element, with the appropriate classes computed.\n",
    "\n",
    "    doc: iterable of tokens exposing `is_space`, `orth_` and `whitespace_`.\n",
    "    get_classes: callable taking a token and returning an iterable of\n",
    "        class-name strings; they are joined with spaces into the span's\n",
    "        class attribute.\n",
    "\n",
    "    Whitespace is kept outside of the spans: each token's trailing\n",
    "    whitespace follows its closing tag and space tokens are emitted as-is,\n",
    "    so you can calculate what you need from it, e.g. <br /> tags, <p> tags,\n",
    "    etc. As a last step newlines are removed and each tab becomes four\n",
    "    spaces.\n",
    "\n",
    "    Returns the marked-up string.\n",
    "    '''\n",
    "    output = []\n",
    "    # The span markup must be part of the template, otherwise the computed\n",
    "    # classes are silently dropped by str.format.\n",
    "    template = '<span class=\"{classes}\">{word}</span>{space}'\n",
    "    for token in doc:\n",
    "        if token.is_space:\n",
    "            output.append(token.orth_)\n",
    "        else:\n",
    "            output.append(\n",
    "              template.format(\n",
    "                classes=' '.join(get_classes(token)),\n",
    "                word=token.orth_,\n",
    "                space=token.whitespace_))\n",
    "    string = ''.join(output)\n",
    "    string = string.replace('\\n', '')\n",
    "    string = string.replace('\\t', '    ')\n",
    "    return string"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Efficient binary serialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from spacy.tokens.doc import Doc\n",
    "\n",
    "# Serialize the doc; the with-block closes the file even if the write fails.\n",
    "byte_string = doc.to_bytes()\n",
    "with open('moby_dick.bin', 'wb') as file_:\n",
    "    file_.write(byte_string)\n",
    "\n",
    "nlp = spacy.load('en')\n",
    "# Iterate over Doc.read_bytes inside the with-block, so the file is still\n",
    "# open while the byte strings are read and is closed afterwards.\n",
    "with open('moby_dick.bin', 'rb') as file_:\n",
    "    for byte_string in Doc.read_bytes(file_):\n",
    "        doc = Doc(nlp.vocab)\n",
    "        doc.from_bytes(byte_string)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
